<!DOCTYPE html>












  


<html class="theme-next gemini use-motion" lang="zh-CN">
<head><meta name="generator" content="Hexo 3.9.0">
  <meta name="baidu-site-verification" content="sc1B5Cxcoz">
  <meta name="google-site-verification" content="OLcRj7lMnauxKrhFT3Q8Nvcexgv22NvlEmVZShKQKR8">
  <meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
<meta name="theme-color" content="#222">


























  

<!-- Font Awesome icon font from the CDN. The URL uses explicit HTTPS instead of a protocol-relative form so the stylesheet is never requested in clear text and still resolves when the page is opened from file://. -->
<link rel="stylesheet" href="https://cdn.bootcss.com/font-awesome/4.6.2/css/font-awesome.min.css">

<link rel="stylesheet" href="/css/main.css?v=7.0.1">


  <link rel="apple-touch-icon" sizes="180x180" href="/uploads/apple-touch.png?v=7.0.1">


  <link rel="icon" type="image/png" sizes="32x32" href="/uploads/favicon-32x32.png?v=7.0.1">


  <link rel="icon" type="image/png" sizes="16x16" href="/uploads/favicon-16x16.png?v=7.0.1">








<!--
  Global theme configuration, declared as the globals NexT and CONFIG.
  NexT reuses window.NexT when it already exists, otherwise it starts as an empty object.
  CONFIG holds the site root, scheme name and version, plus feature settings
  (sidebar, back2top, fancybox, fastclick, lazyload, tabs, motion) and an algolia
  section whose credentials are empty strings on this page.
  NOTE(review): the scripts that read CONFIG are not visible in this file; confirm
  against the theme scripts before renaming or removing any key.
-->
<script id="hexo.configurations">
  var NexT = window.NexT || {};
  var CONFIG = {
    root: '/',
    scheme: 'Gemini',
    version: '7.0.1',
    sidebar: {"position":"right","display":"post","offset":12,"onmobile":false,"dimmer":false},
    back2top: true,
    back2top_sidebar: false,
    fancybox: false,
    fastclick: false,
    lazyload: true,
    tabs: true,
    motion: {"enable":true,"async":true,"transition":{"post_block":"fadeIn","post_header":"fadeIn","post_body":"fadeIn","coll_header":"fadeIn","sidebar":"fadeIn"}},
    algolia: {
      applicationID: '',
      apiKey: '',
      indexName: '',
      hits: {"per_page":10},
      labels: {"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}
    }
  };
</script>


  




  <meta name="description" content="前言数据挖掘（Data Mining），一般指从海量抓取的数据中经过一定的数据处理、算法，从而提取出有价值的信息的过程。它大体基于统计学、机器学习（Machine Learning）等原理，辅佐了人类的信息处理工作，为人工智能（AI）铺下道路。 幸运的是，似乎正是因数据挖掘而生的那样，Python社区中有各种数据挖掘相关的package，能够满足各种数据处理与算法模型构建需求。我们只需要pip/c">
<meta name="keywords" content="python,数据挖掘,文本分类,scikit-learn,newsgroup">
<meta property="og:type" content="article">
<meta property="og:title" content="【Easy Python】第五话：小试scikit-learn数据挖掘——newsgroup数据处理与文本分类">
<meta property="og:url" content="http://utmhikari.github.io/2019/04/14/easypython/v/index.html">
<meta property="og:site_name" content="HiKariのTechLab">
<meta property="og:description" content="前言数据挖掘（Data Mining），一般指从海量抓取的数据中经过一定的数据处理、算法，从而提取出有价值的信息的过程。它大体基于统计学、机器学习（Machine Learning）等原理，辅佐了人类的信息处理工作，为人工智能（AI）铺下道路。 幸运的是，似乎正是因数据挖掘而生的那样，Python社区中有各种数据挖掘相关的package，能够满足各种数据处理与算法模型构建需求。我们只需要pip/c">
<meta property="og:locale" content="zh-CN">
<meta property="og:updated_time" content="2019-08-19T13:37:51.175Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="【Easy Python】第五话：小试scikit-learn数据挖掘——newsgroup数据处理与文本分类">
<meta name="twitter:description" content="前言数据挖掘（Data Mining），一般指从海量抓取的数据中经过一定的数据处理、算法，从而提取出有价值的信息的过程。它大体基于统计学、机器学习（Machine Learning）等原理，辅佐了人类的信息处理工作，为人工智能（AI）铺下道路。 幸运的是，似乎正是因数据挖掘而生的那样，Python社区中有各种数据挖掘相关的package，能够满足各种数据处理与算法模型构建需求。我们只需要pip/c">






  <link rel="canonical" href="http://utmhikari.github.io/2019/04/14/easypython/v/">



<!--
  Per-page settings attached to the global CONFIG object (declared by the
  hexo.configurations script earlier in this document, so this script must run after it).
  On this page the only entry is sidebar, set to an empty string.
-->
<script id="page.configurations">
  CONFIG.page = {
    sidebar: "",
  };
</script>

  <title>【Easy Python】第五话：小试scikit-learn数据挖掘——newsgroup数据处理与文本分类 | HiKariのTechLab</title>
  












  <!--
    Fallback styles applied only when scripting is disabled: they reset opacity
    (and top, left or right for the logo parts) to their initial values on the
    elements selected below.
    NOTE(review): presumably main.css hides these elements until the motion scripts
    reveal them; confirm against main.css before changing the selector list.
  -->
  <noscript>
  <style>
  .use-motion .motion-element,
  .use-motion .brand,
  .use-motion .menu-item,
  .sidebar-inner,
  .use-motion .post-block,
  .use-motion .pagination,
  .use-motion .comments,
  .use-motion .post-header,
  .use-motion .post-body,
  .use-motion .collection-title { opacity: initial; }

  .use-motion .logo,
  .use-motion .site-title,
  .use-motion .site-subtitle {
    opacity: initial;
    top: initial;
  }

  .use-motion .logo-line-before i { left: initial; }
  .use-motion .logo-line-after i { right: initial; }
  </style>
</noscript>

</head>

<body itemscope itemtype="http://schema.org/WebPage" lang="zh-CN">

  
  
    
  

  <div class="container sidebar-position-right page-post-detail">
    <div class="headband"></div>

    <header id="header" class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-wrapper">
  <div class="site-meta">
    

    <div class="custom-logo-site-title">
      <a href="/" class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">HiKariのTechLab</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
    
      
        <h1 class="site-subtitle" itemprop="description">光の技术屋</h1>
      
    
    
  </div>

  <div class="site-nav-toggle">
    <button aria-label="切换导航栏">
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
    </button>
  </div>
</div>



<nav class="site-nav">
  
    <ul id="menu" class="menu">
      
        
        
        
          
          <li class="menu-item menu-item-home">

    
    
    
      
    

    

    <a href="/" rel="section"><i class="menu-item-icon fa fa-fw fa-home"></i> <br>首页</a>

  </li>
        
        
        
          
          <li class="menu-item menu-item-about">

    
    
    
      
    

    

    <a href="/about/" rel="section"><i class="menu-item-icon fa fa-fw fa-user"></i> <br>关于</a>

  </li>
        
        
        
          
          <li class="menu-item menu-item-tags">

    
    
    
      
    

    

    <a href="/tags/" rel="section"><i class="menu-item-icon fa fa-fw fa-tags"></i> <br>标签</a>

  </li>
        
        
        
          
          <li class="menu-item menu-item-categories">

    
    
    
      
    

    

    <a href="/categories/" rel="section"><i class="menu-item-icon fa fa-fw fa-th"></i> <br>分类</a>

  </li>
        
        
        
          
          <li class="menu-item menu-item-archives">

    
    
    
      
    

    

    <a href="/archives/" rel="section"><i class="menu-item-icon fa fa-fw fa-archive"></i> <br>归档</a>

  </li>
        
        
        
          
          <li class="menu-item menu-item-sitemap">

    
    
    
      
    

    

    <a href="/sitemap.xml" rel="section"><i class="menu-item-icon fa fa-fw fa-sitemap"></i> <br>站点地图</a>

  </li>

      
      
        <li class="menu-item menu-item-search">
          
            <a href="javascript:;" class="popup-trigger">
          
            
              <i class="menu-item-icon fa fa-search fa-fw"></i> <br>搜索</a>
        </li>
      
    </ul>
  

  

  
    <div class="site-search">
      
  <!-- Local search popup. The text field has no visible label, so aria-label supplies its accessible name; a placeholder alone is not exposed reliably as a label. -->
  <div class="popup search-popup local-search-popup">
  <div class="local-search-header clearfix">
    <span class="search-icon">
      <i class="fa fa-search"></i>
    </span>
    <span class="popup-btn-close">
      <i class="fa fa-times-circle"></i>
    </span>
    <div class="local-search-input-wrapper">
      <input autocomplete="off" placeholder="搜索..." spellcheck="false" type="text" id="local-search-input" aria-label="搜索">
    </div>
  </div>
  <div id="local-search-result"></div>
</div>



    </div>
  
</nav>



  



</div>
    </header>

    
  
  
  
  

  

  <span class="exturl github-corner" data-url="aHR0cHM6Ly9naXRodWIuY29tL3V0bWhpa2FyaQ==" title="关注HiKariのGithub" aria-label="关注HiKariのGithub"><svg width="80" height="80" viewbox="0 0 250 250" style="fill: #222; color: #fff; position: absolute; top: 0; border: 0; right: 0;" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"/><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"/><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"/></svg></span>



    <main id="main" class="main">
      <div class="main-inner">
        <div class="content-wrap">
          
            

          
          <div id="content" class="content">
            

  <div id="posts" class="posts-expand">
    

  

  
  
  

  

  <article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
  
  
  
  <div class="post-block">
    <link itemprop="mainEntityOfPage" href="http://utmhikari.github.io/2019/04/14/easypython/v/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="name" content="ひかり.HDQ">
      <meta itemprop="description" content="简约即走心">
      <meta itemprop="image" content="/uploads/avatar.png">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="HiKariのTechLab">
    </span>

    
      <header class="post-header">

        
        
          <h2 class="post-title" itemprop="name headline">【Easy Python】第五话：小试scikit-learn数据挖掘——newsgroup数据处理与文本分类

              
            
          </h2>
        

        <div class="post-meta">
          <span class="post-time">

            
            
            

            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              
                <span class="post-meta-item-text">发表于</span>
              

              
                
              

              <time title="创建时间：2019-04-14 01:59:24" itemprop="dateCreated datePublished" datetime="2019-04-14T01:59:24+08:00">2019-04-14</time>
            

            
              

              
                
                <span class="post-meta-divider">|</span>
                

                <span class="post-meta-item-icon">
                  <i class="fa fa-calendar-check-o"></i>
                </span>
                
                  <span class="post-meta-item-text">更新于</span>
                
                <time title="修改时间：2019-08-19 21:37:51" itemprop="dateModified" datetime="2019-08-19T21:37:51+08:00">2019-08-19</time>
              
            
          </span>

          
            <span class="post-category">
            
              <span class="post-meta-divider">|</span>
            
              <span class="post-meta-item-icon">
                <i class="fa fa-folder-o"></i>
              </span>
              
                <span class="post-meta-item-text">分类于</span>
              
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing"><a href="/categories/Easy-Python/" itemprop="url" rel="index"><span itemprop="name">Easy Python</span></a></span>

                
                
              
            </span>
          

          
            
            
          

          
          

          
            <span class="post-meta-divider">|</span>
            <span class="post-meta-item-icon">
            <i class="fa fa-eye"></i>
             阅读次数： 
            <span class="busuanzi-value" id="busuanzi_value_page_pv"></span>
            </span>
          

          

          

        </div>
      </header>
    

    
    
    
    <div class="post-body" itemprop="articleBody">

      
      

      
        <h2 id="前言"><a href="#前言" class="headerlink" title="前言"></a>前言</h2><p>数据挖掘（Data Mining），一般指从海量抓取的数据中经过一定的数据处理、算法，从而提取出有价值的信息的过程。它大体基于统计学、机器学习（Machine Learning）等原理，辅佐了人类的信息处理工作，为人工智能（AI）铺下道路。</p>
<p>幸运的是，似乎正是因数据挖掘而生的那样，Python社区中有各种数据挖掘相关的package，能够满足各种数据处理与算法模型构建需求。我们只需要<code>pip/conda install 包名</code>，然后查查api文档，熬几十行代码，就能玩一玩数据挖掘。</p>
<p>为此，在这一话，我们以自然语言处理（NLP）的文本分类（Text Classification）为例，设计一个最simple的，最old school的，以Python为例的，从数据获取到产生数据挖掘结果的流程。</p>
<h2 id="文本分类是什么？"><a href="#文本分类是什么？" class="headerlink" title="文本分类是什么？"></a>文本分类是什么？</h2><a id="more"></a>
<p>文本分类，更通常的理解，叫文本自动分类（auto-classification），是文本数据挖掘最普通不过的方法了。文本分类应用的例子比比皆是，比如某个新闻网站，爬到了海量外部的新闻文本，当人力不足以将其一个个归类时，就需要借助计算机的力量，将那些没有标注类别（category labelling）的新闻自动归类到已有的类别当中。所以，我们的问题就是——怎样像人一样，去识别那些没有归类的新闻的类别呢？</p>
<p>俗话说的好，只要功夫深，铁杵磨成针。我们在孩提时代，是父母告诉我们，这只猫，那是狗，我们才能对不同的动物进行分辨。在数据挖掘领域，我们可以利用分类器（classifier），满足自动分类的需求。分类器就像我们的大脑一样，可以通过吸收不同知识，调整自己的决策，但其本质上，却是一个夹杂了繁复数学计算的计算机程序而已。我们要做的，则是把已有的资源，也就是归类好的那些新闻文本，去告诉分类器，这篇是A类，这篇是B类，从而训练（train）它的新闻类别识别能力。这样，面对各种未归类新闻的考验（test），分类器就可以争取像人那样，把新闻的类别识别出来了。</p>
<p>为此，要保证计算机的识别效果，完备优良的训练材料（training set）和精致缜密的训练方法（algorithm model）都必不可少。</p>
<h2 id="简单的例子——newsgroup文本分类"><a href="#简单的例子——newsgroup文本分类" class="headerlink" title="简单的例子——newsgroup文本分类"></a>简单的例子——newsgroup文本分类</h2><p>以下，我们就开始最简单的文本分类流程示例啦！我们采用<span class="exturl" data-url="aHR0cHM6Ly9zY2lraXQtbGVhcm4ub3JnL3N0YWJsZS8=" title="https://scikit-learn.org/stable/">scikit-learn<i class="fa fa-external-link"></i></span>提供的工具进行文本分类流程模拟。</p>
<h3 id="newsgroup数据集下载"><a href="#newsgroup数据集下载" class="headerlink" title="newsgroup数据集下载"></a>newsgroup数据集下载</h3><p>文本分类数据集，我们采用最经典的新闻数据集：<span class="exturl" data-url="aHR0cDovL3F3b25lLmNvbS9+amFzb24vMjBOZXdzZ3JvdXBzLw==" title="http://qwone.com/~jason/20Newsgroups/">20 newsgroup数据集<i class="fa fa-external-link"></i></span>进行模拟，使用的版本为<span class="exturl" data-url="aHR0cDovL3F3b25lLmNvbS9+amFzb24vMjBOZXdzZ3JvdXBzLzIwbmV3cy0xODgyOC50YXIuZ3o=" title="http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz">18828版<i class="fa fa-external-link"></i></span>，记录了18828篇不重复的英文新闻。虽然<code>scikit-learn</code>库默认提供该数据集的下载处理，但是在这一话，我们就自己实现一遍吧~</p>
<p>下载，解压，总共有<code>alt.atheism</code>到<code>talk.religion.misc</code>20个类别的文本。打开每一个类别文件夹，能看到以新闻编号为文件名（没有后缀名）的新闻文件。用记事本打开，就能够看到里面的新闻内容啦。</p>
<h3 id="新闻数据读取-amp-预处理"><a href="#新闻数据读取-amp-预处理" class="headerlink" title="新闻数据读取&amp;预处理"></a>新闻数据读取&amp;预处理</h3><p>要模拟文本分类，需要把每一个新闻跟它们的类别一一对应。在<code>scikit-learn</code>中，要实现newsgroup新闻内容与类别的对应，需要建立两个列表：</p>
<ul>
<li>所有新闻的列表</li>
<li>所有新闻归属类别编号的列表（1~n）</li>
</ul>
<p>因此，我们在读取新闻文件内容的时候，也要做一个类别标签的列表。具体代码如下：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> os</span><br><span class="line"></span><br><span class="line"><span class="comment"># out directory to store newsgroup dataset</span></span><br><span class="line">directory = <span class="string">'./20news-18828'</span></span><br><span class="line"><span class="comment"># category_names[label_number - 1] = category name</span></span><br><span class="line">category_names = os.listdir(directory)</span><br><span class="line"><span class="comment"># sequence of news contents --- X</span></span><br><span class="line">news_contents = list()</span><br><span class="line"><span class="comment"># sequence of news labels  --- Y</span></span><br><span class="line">news_labels = list()</span><br><span class="line"><span class="comment"># traverse into directories</span></span><br><span class="line"><span class="keyword">for</span> i <span class="keyword">in</span> range(len(category_names)):</span><br><span class="line">    category = category_names[i]</span><br><span class="line">    category_dir = os.path.join(directory, category)</span><br><span class="line">    <span class="keyword">for</span> file_name <span class="keyword">in</span> os.listdir(category_dir):</span><br><span class="line">        file_path = os.path.join(category_dir, file_name)</span><br><span class="line">        <span class="comment"># get the word list of a single news file</span></span><br><span class="line">        raw_content = open(file_path, encoding=<span class="string">'latin1'</span>).read().strip()</span><br><span class="line">        <span class="comment"># preprocess data</span></span><br><span class="line">        news_content = preprocess_content(raw_content)</span><br><span class="line">        <span class="comment"># append news labels and news contents</span></span><br><span class="line">        news_labels.append(i + <span class="number">1</span>)</span><br><span class="line">        news_contents.append(news_content)</span><br></pre></td></tr></table></figure>
<p>其中，<code>news_contents</code>是我们的新闻内容；<code>news_labels</code>是我们每个新闻内容对应类别的编号的列表，跟<code>news_contents</code>一样长；而<code>category_names</code>则是类别名的列表了。我们遍历每个类别目录去读取新闻文件内容，文件编码经查证是<code>latin1</code>。每读到一个文件，我们都去用一个<code>preprocess_content</code>函数预处理（preprocessing）这个文件的内容，然后把文件内容加到<code>news_contents</code>中，把这个新闻对应的类别编号，此处设为<code>索引i + 1</code>，加到<code>news_labels</code>中。</p>
<p>预处理文本方便了我们后续对文本数据的操作。那么，如何预处理newsgroup文本数据呢？这就和新闻文本的数学模型表示方法有关了。把文字堆砌文本变成数学模型，分类器才能够学习不同类别的文本是这样那样的。这个过程，我们叫做拟合（fit）。对于新闻类的长文本来说，最简单粗暴oldschool的方法，就是用词频、关键词之类的信息来表示文本内容。虽然这种方法忽略了词与词之间的上下文关系，但从实践效果来看，已经很ok了。</p>
<p>要获取文本的词频、关键词等信息，就涉及到文本的分词。newsgroup收集的是英文新闻，因此为了让后续的分词更加方便，我们希望在预处理的过程中，<strong>过滤掉新闻文本标点符号之类的干扰字符，把所有单词都以空格相连</strong>，这样就完成了文本的预处理了。</p>
<p>完成这个需求，就需要一个文本预处理的强大武器——<span class="exturl" data-url="aHR0cHM6Ly9kb2NzLnB5dGhvbi5vcmcvMy9saWJyYXJ5L3JlLmh0bWw=" title="https://docs.python.org/3/library/re.html">正则表达式<i class="fa fa-external-link"></i></span>（Regular Expression）。通过正则表达式，我们可以匹配一个样式（pattern）的文本，并对它进行操作。</p>
<p>那么我们的<code>preprocess_content</code>预处理函数，就可以这样写啦：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># replace any character that is not digit or letter or space with empty string</span></span><br><span class="line">replace_with_empty_pattern = re.compile(<span class="string">r'[^A-Za-z0-9\s]'</span>)</span><br><span class="line"><span class="comment"># replace consecutive spaces and enters(\n) with a single space</span></span><br><span class="line">replace_with_single_space_pattern = re.compile(<span class="string">r'\s&#123;2,&#125;|[^\S ]'</span>)</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">preprocess_content</span><span class="params">(content)</span>:</span></span><br><span class="line">    <span class="keyword">return</span> re.sub(</span><br><span class="line">        replace_with_single_space_pattern, <span class="string">' '</span>,</span><br><span class="line">        re.sub(replace_with_empty_pattern, <span class="string">''</span>, content)</span><br><span class="line">    )</span><br></pre></td></tr></table></figure>
<p>我们首先去掉标点符号之类的干扰字符，然后把所有的单词都以一个空格相隔。为此，我们做两个正则表达式，完成这个需求吧~</p>
<ul>
<li><code>[^A-Za-z0-9\s]</code>  — 首先，<code>[^嘻哈嘿]</code>代表不匹配<code>嘻哈嘿</code>之类的字符；然后，<code>A-Za-z0-9</code>就顾名思义，代表数字跟字母；最后，<code>\s</code>代表空白字符，包括比如空格(space)啊、回车(enter)啊、制表符（tab）之类。连起来，就是<strong>不匹配数字、字母跟空白字符的那些文本</strong>的意思，也就正好对应我们的干扰字符。我们用<code>re.sub</code>方法，就可以把这些干扰字符替换成空字符串<code>&#39;&#39;</code>，从而去掉它们。</li>
<li><code>\s{2,}|[^\S ]</code> — 首先，<code>\s{2,}</code>表示连续出现两次或以上的空白字符；其次，<code>|</code>代表“或者”的意思，最后，<code>[^\S ]</code>中的<code>\S</code>，代表非空白字符，整一下就表示空白字符里除去空格外（里边多<code>^</code>了一个空格喔）其它所有的空白字符。这样，整个正则表达式就表示——<strong>除了单个空格外，所有空白字符组合成字符串</strong>的情况了。我们把出现这些情况的字符串都用单个空格<code>&#39; &#39;</code>代替，这样所有的单词都以空格相隔了。（诶，其实嫌麻烦的话，直接<code>\s+</code>也成= =）</li>
</ul>
<p><code>print</code>一下试试看吧~</p>
<h3 id="新闻文本分类"><a href="#新闻文本分类" class="headerlink" title="新闻文本分类"></a>新闻文本分类</h3><p>如上所说，要实现一个简单的文本分类流程，就需要准备好训练计算机的数据和用于测试计算机的数据。我们可以把刚刚处理好的新闻内容跟类别标签列表洗刷刷（shuffle），然后分隔一部分用于训练，一部分用于测试。这里，我们把训练跟测试数据集的比重设成1：1先啦~</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> sklearn.model_selection <span class="keyword">import</span> train_test_split</span><br><span class="line"></span><br><span class="line">train_contents, test_contents, train_labels, test_labels = \</span><br><span class="line">    train_test_split(news_contents, news_labels, shuffle=<span class="literal">True</span>, test_size=<span class="number">0.5</span>)</span><br></pre></td></tr></table></figure>
<p>然后，我们需要把我们的新闻，转化成数学模型，从而被分类器识别。按照最简单粗暴的方法，我们可以把整个数据集所有出现过的不同词语整合成一个词表（vocabulary）。然后，针对每一篇文章，都计算<strong>词表里每个词在这篇文章的关键程度</strong>，再一整合，就成了这篇文章的数学模型表示了。</p>
<p>怎样计算词表里每个词在单篇文章的关键程度呢？最老掉牙但又有效的方法，就是通过<span class="exturl" data-url="aHR0cHM6Ly96aC53aWtpcGVkaWEub3JnL3poLWhhbnMvVGYtaWRm" title="https://zh.wikipedia.org/zh-hans/Tf-idf">TF-IDF<i class="fa fa-external-link"></i></span>计算了。TF（Term Frequency）代表这个词在该篇文章出现的频率，IDF（Inverse Document Frequency）代表这个词在整个数据集中的低频程度。一个词，出现在该篇文章次数多，出现在整个数据集次数少，就表示这个词语能够更加突出该篇文章的语义。</p>
<p>为此，我们可以通过<code>scikit-learn</code>内置的<code>TfidfVectorizer</code>，把文章的文本转化为所有词语在该篇文章关键程度的集合，也就是个向量啦~</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> sklearn.feature_extraction.text <span class="keyword">import</span> TfidfVectorizer</span><br><span class="line"></span><br><span class="line">tfidf_vectorizer = TfidfVectorizer()</span><br></pre></td></tr></table></figure>
<p>为了把每篇文章的数学模型进行训练，我们就需要一个分类器。常用的分类器有许多，这里，我们就以SVM（Support Vector Machine，支持向量机）为例啦- -</p>
<p>SVM的分类原理可以用切西瓜来比喻——西瓜里有白的黑的籽，现在不管你刀的形状，怎样来一刀，使得刀两边能够尽量分别是白籽跟黑籽，然后刀到白籽黑籽两者距离的最小值能尽量大呢？比喻说的简单，实际计算还是复杂的（数学渣TAT）。但我们若是单纯引用，则不需要管这些数学问题，直接import就好啦~</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> sklearn.svm <span class="keyword">import</span> LinearSVC</span><br><span class="line"></span><br><span class="line"><span class="comment"># LinearSVC：线性支持向量分类器</span></span><br><span class="line">svm_classifier = LinearSVC(verbose=<span class="literal">True</span>)</span><br></pre></td></tr></table></figure>
<p>在前面的<code>TfidfVectorizer</code>中，我们会先对训练集操作，最后对于每一个新闻生成的数学模型，都是一个维度很高的向量（词表长度= =）。为了能够让我们的SVM分类器训练更加效率，我们可以采用特征选择（feature selection）的方法，在词表中挑选少量来作为每个新闻数学模型的维度就好啦。</p>
<p>卡方统计量（chi2）是常用的特征选择指标。卡方统计量能够衡量词语跟类别的相关性，因此通过卡方指标打分筛选词表中的词语，我们就可以剔除许多影响分类效果的常见词了。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> sklearn.feature_selection <span class="keyword">import</span> SelectKBest, chi2</span><br><span class="line"></span><br><span class="line"><span class="comment"># 选10000个词语作为文档特征词</span></span><br><span class="line">chi2_feature_selector = SelectKBest(chi2, k=<span class="number">10000</span>)</span><br></pre></td></tr></table></figure>
<p>最后，我们需要一个流水线（pipeline），把整个流程串起来——</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> sklearn.pipeline <span class="keyword">import</span> Pipeline</span><br><span class="line"></span><br><span class="line">pipeline = Pipeline(memory=<span class="literal">None</span>, steps=[</span><br><span class="line">    (<span class="string">'tfidf'</span>, tfidf_vectorizer),</span><br><span class="line">    (<span class="string">'chi2'</span>, chi2_feature_selector),</span><br><span class="line">    (<span class="string">'svm'</span>, svm_classifier),</span><br><span class="line">])</span><br></pre></td></tr></table></figure>
<p>对于训练数据，我们先把其转化为以TF-IDF为基础的数学模型，然后通过chi2方法选择特定数量的词语从而剔除干扰词，最后把它输入到SVM分类器中进行训练。</p>
<p>对于测试数据，我们同样先将其转化为TF-IDF为基础的数据，然后在chi2流程时，把训练数据选出的词语应用到测试数据中作为每一个测试文档的特征词（feature），最后再将其输入到SVM分类器中，进行类别预测（predict），也就是“分类”啦。</p>
<p>得到预测结果后，我们可以通过<code>classification_report</code>模块，去展现我们的测试报告。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> sklearn.metrics <span class="keyword">import</span> classification_report</span><br><span class="line"></span><br><span class="line">pipeline.fit(train_contents, train_labels)</span><br><span class="line">result = pipeline.predict(test_contents)</span><br><span class="line">report = classification_report(test_labels, result, target_names=category_names)</span><br></pre></td></tr></table></figure>
<p>试试看吧~</p>
<h2 id="总结"><a href="#总结" class="headerlink" title="总结"></a>总结</h2><p>用python进行newsgroup文本分类，不过是小菜花生。在数据挖掘、机器学习领域，更多的是数学的扎实程度，代码能力并非最为重要。</p>
<p>newsgroup的数据，总共不到两万，算是少之又少。更为海量的数据，要进行数据处理挖掘，光靠个默认的SVM之类也是徒劳，需要更为复杂、更加深层次的模型，好比说神经网络，才能硬刚。</p>
<p>但是很庆幸，我们拥有python，和背后强大的社区。</p>
<p>我想反复说，为什么叫easy python？比python更容易上手的语言很多，lua就是其中一个。但是，lua现在支持那么多数据挖掘需求吗？并没有。</p>
<p>easy python，因为它就如电子琴，能够让我们随心所欲，天马行空。</p>

      
    </div>

    

    
    
    

    
      
<div style="border: 1px solid black; margin-top: 32px; margin-bottom: 32px">
  <div style="margin-left:10px">
    <span style="font-weight:bold">版权声明</span>
    <br>
    <span>本文为博客<a href="https://utmhikari.github.io" style="color:#258FC6">HiKariのTechLab</a>原创文章，转载请标明出处，谢谢~~~</span>
  </div>
</div>


    

    

    
      
    
    

    

    <footer class="post-footer">
      
        <div class="post-tags">
          
            <a href="/tags/python/" rel="tag"># python</a>
          
            <a href="/tags/数据挖掘/" rel="tag"># 数据挖掘</a>
          
            <a href="/tags/文本分类/" rel="tag"># 文本分类</a>
          
            <a href="/tags/scikit-learn/" rel="tag"># scikit-learn</a>
          
            <a href="/tags/newsgroup/" rel="tag"># newsgroup</a>
          
        </div>
      

      
      
      

      
        <div class="post-nav">
          <div class="post-nav-next post-nav-item">
            
              <a href="/2019/04/07/luatalk/lfunctimer_capi/" rel="next" title="【Lua杂谈】debug.getinfo源码分析——使用C API重写lfunctimer">
                <i class="fa fa-chevron-left"></i> 【Lua杂谈】debug.getinfo源码分析——使用C API重写lfunctimer
              </a>
            
          </div>

          <span class="post-nav-divider"></span>

          <div class="post-nav-prev post-nav-item">
            
              <a href="/2019/04/20/luatalk/lnodelist/" rel="prev" title="【Lua杂谈】探索C API，开坑lnodelist">
                【Lua杂谈】探索C API，开坑lnodelist <i class="fa fa-chevron-right"></i>
              </a>
            
          </div>
        </div>
      

      
      
    </footer>
  </div>
  
  
  
  </article>


  </div>


          </div>
          

  



        </div>
        
          
  
  <div class="sidebar-toggle">
    <div class="sidebar-toggle-line-wrap">
      <span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
    </div>
  </div>

  <aside id="sidebar" class="sidebar">
    <div class="sidebar-inner">

      

      
        <ul class="sidebar-nav motion-element">
          <li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
            文章目录
          </li>
          <li class="sidebar-nav-overview" data-target="site-overview-wrap">
            站点概览
          </li>
        </ul>
      

      <div class="site-overview-wrap sidebar-panel">
        <div class="site-overview">
          <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
            
              <img class="site-author-image" itemprop="image" src="/uploads/avatar.png" alt="ひかり.HDQ">
            
              <p class="site-author-name" itemprop="name">ひかり.HDQ</p>
              <div class="site-description motion-element" itemprop="description">简约即走心</div>
          </div>

          
            <nav class="site-state motion-element">
              
                <div class="site-state-item site-state-posts">
                
                  <a href="/archives/">
                
                    <span class="site-state-item-count">60</span>
                    <span class="site-state-item-name">日志</span>
                  </a>
                </div>
              

              
                
                
                <div class="site-state-item site-state-categories">
                  
                    
                      <a href="/categories/">
                    
                  
                    
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                    <span class="site-state-item-count">7</span>
                    <span class="site-state-item-name">分类</span>
                  </a>
                </div>
              

              
                
                
                <!-- Tag count; links to the tags index. -->
                <div class="site-state-item site-state-tags">
                  <a href="/tags/">
                    <span class="site-state-item-count">201</span>
                    <span class="site-state-item-name">标签</span>
                  </a>
                </div>
              
            </nav>
          

          

          

          
            <!-- Author's external profiles. Each data-url holds the link target base64-encoded (the first one decodes to the GitHub URL shown in its title). NOTE(review): presumably opened on click by the theme's exturl.js; confirm. Icons are decorative, so they are hidden from assistive technology. -->
            <div class="links-of-author motion-element">
              <span class="links-of-author-item">
                <span class="exturl" data-url="aHR0cHM6Ly9naXRodWIuY29tL3V0bWhpa2FyaQ==" title="GitHub &rarr; https://github.com/utmhikari"><i class="fa fa-fw fa-github" aria-hidden="true"></i>GitHub</span>
              </span>
              <span class="links-of-author-item">
                <span class="exturl" data-url="bWFpbHRvOmhkcTk0MDYyM0AxMjYuY29t" title="Mail &rarr; mailto:hdq940623@126.com"><i class="fa fa-fw fa-envelope" aria-hidden="true"></i>Mail</span>
              </span>
              <span class="links-of-author-item">
                <span class="exturl" data-url="aHR0cHM6Ly90ZXN0ZXJob21lLmNvbS91dG1oaWthcmk=" title="TesterHome &rarr; https://testerhome.com/utmhikari"><i class="fa fa-fw fa-tumblr" aria-hidden="true"></i>TesterHome</span>
              </span>
              <span class="links-of-author-item">
                <span class="exturl" data-url="aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3UwMTM4NDI1MDE=" title="CSDN &rarr; https://blog.csdn.net/u013842501"><i class="fa fa-fw fa-copyright" aria-hidden="true"></i>CSDN</span>
              </span>
              <span class="links-of-author-item">
                <span class="exturl" data-url="aHR0cHM6Ly9zdGVhbWNvbW11bml0eS5jb20vaWQvdXRtaGlrYXJp" title="Steam &rarr; https://steamcommunity.com/id/utmhikari"><i class="fa fa-fw fa-steam" aria-hidden="true"></i>Steam</span>
              </span>
              <span class="links-of-author-item">
                <span class="exturl" data-url="aHR0cHM6Ly9zcGFjZS5iaWxpYmlsaS5jb20vMTUwNzUxMTQv" title="Bilibili &rarr; https://space.bilibili.com/15075114/"><i class="fa fa-fw fa-bold" aria-hidden="true"></i>Bilibili</span>
              </span>
            </div>
          

          

          
          

          
            
          
          

        </div>
      </div>

      
      <!--noindex-->
        <!-- Table of contents for the current post: nested ordered lists of numbered links to in-page fragments (href="#..."). NOTE(review): nav-level-N presumably mirrors the heading level of the target; confirm against the post body. -->
        <div class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
          <div class="post-toc">

            
            
            
            

            
              <div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#前言"><span class="nav-number">1.</span> <span class="nav-text">前言</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#文本分类是什么？"><span class="nav-number">2.</span> <span class="nav-text">文本分类是什么？</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#简单的例子——newsgroup文本分类"><span class="nav-number">3.</span> <span class="nav-text">简单的例子——newsgroup文本分类</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#newsgroup数据集下载"><span class="nav-number">3.1.</span> <span class="nav-text">newsgroup数据集下载</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#新闻数据读取-amp-预处理"><span class="nav-number">3.2.</span> <span class="nav-text">新闻数据读取&amp;预处理</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#新闻文本分类"><span class="nav-number">3.3.</span> <span class="nav-text">新闻文本分类</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#总结"><span class="nav-number">4.</span> <span class="nav-text">总结</span></a></li></ol></div>
            

          </div>
        </div>
      <!--/noindex-->
      

      

    </div>
  </aside>
  


        
      </div>
    </main>

    <!-- Site footer: copyright line plus total-visitor / total-visit counters. NOTE(review): the empty busuanzi-value spans are presumably filled in by the busuanzi script; confirm. Icons are decorative, so they are hidden from assistive technology. -->
    <footer id="footer" class="footer">
      <div class="footer-inner">
        <div class="copyright">&copy; 2019 – <span itemprop="copyrightYear">2020</span>
          <span class="with-love" id="animate">
            <i class="fa fa-user" aria-hidden="true"></i>
          </span>
          <span class="author" itemprop="copyrightHolder">ひかり.HDQ</span>
        </div>

        <div class="busuanzi-count">
          <script async src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>

          <span class="post-meta-item-icon">
            <i class="fa fa-user" aria-hidden="true"></i>
          </span>
          <span class="site-uv" title="总访客量">
            <span class="busuanzi-value" id="busuanzi_value_site_uv"></span>
          </span>

          <span class="post-meta-divider">|</span>

          <span class="post-meta-item-icon">
            <i class="fa fa-eye" aria-hidden="true"></i>
          </span>
          <span class="site-pv" title="总访问量">
            <span class="busuanzi-value" id="busuanzi_value_site_pv"></span>
          </span>
        </div>
      </div>
    </footer>

    
      <!-- Back-to-top control (CONFIG.back2top is true in the page config). NOTE(review): no click handler is attached in this part of the file; presumably wired up by the theme scripts; confirm. -->
      <div class="back-to-top">
        <i class="fa fa-arrow-up"></i>
        
      </div>
    

    

    

    
  </div>

  

<script>
  // Null out window.Promise unless its Object.prototype.toString tag is
  // exactly '[object Function]'.
  (function (global) {
    var isPlainFunction = Object.prototype.toString.call(global.Promise) === '[object Function]';
    if (!isPlainFunction) {
      global.Promise = null;
    }
  })(window);
</script>







  




















  
  <!-- Third-party libraries. Explicit https: so they are never fetched over plaintext when the page is served over http, and still load when the page is opened from file://. NOTE(review): no SRI hashes are set; consider adding integrity/crossorigin or self-hosting. -->
  <script src="https://cdn.bootcss.com/jquery/2.1.3/jquery.min.js"></script>

  <script src="https://cdn.bootcss.com/jquery_lazyload/1.9.7/jquery.lazyload.min.js"></script>

  <script src="https://cdn.bootcss.com/velocity/1.2.1/velocity.min.js"></script>

  <script src="https://cdn.bootcss.com/velocity/1.2.1/velocity.ui.min.js"></script>


  


  <script src="/js/src/utils.js?v=7.0.1"></script>

  <script src="/js/src/motion.js?v=7.0.1"></script>



  
  


  <script src="/js/src/affix.js?v=7.0.1"></script>

  <script src="/js/src/schemes/pisces.js?v=7.0.1"></script>




  
  <script src="/js/src/scrollspy.js?v=7.0.1"></script>
<script src="/js/src/post-details.js?v=7.0.1"></script>



  


  <script src="/js/src/next-boot.js?v=7.0.1"></script>


  

  
  <script src="/js/src/exturl.js?v=7.0.1"></script>


  

  


  


  
  <script>
    // Popup state: set to true by the search loader once the index has been fetched.
    var isfetched = false;
    // Search DB path; an empty value falls back to the default file name.
    var search_path = "search.xml";
    if (!search_path.length) {
      search_path = "search.xml";
    }
    // The index is treated as XML unless the path ends in "json" (case-insensitive).
    var isXml = !/json$/i.test(search_path);
    var path = "/" + search_path;
    // monitor main search box;

    // Dismiss the search popup: hide it, clear the query box, drop the
    // rendered results / placeholder / overlay, and clear the inline
    // `overflow` style on <body>.
    var onPopupClose = function (e) {
      $('.popup').hide();
      $('#local-search-input').val('');
      $('.search-result-list, #no-result, .local-search-pop-overlay').remove();
      $('body').css('overflow', '');
    };

    // Open the search popup: add a click-to-close overlay, set
    // overflow:hidden on <body>, toggle the popup and focus the query input
    // with auto-capitalisation and auto-correction switched off.
    function proceedsearch() {
      var $page = $("body");
      $page.append('<div class="search-popup-overlay local-search-pop-overlay"></div>');
      $page.css('overflow', 'hidden');
      $('.search-popup-overlay').click(onPopupClose);
      $('.popup').toggle();
      $('#local-search-input')
        .attr({ autocapitalize: "none", autocorrect: "off" })
        .focus();
    }

    // search function;
    /**
     * Load the search index and wire the local-search popup to it.
     *
     * Shows a loading overlay, requests `path` with $.ajax (parsed as XML or
     * JSON depending on the outer `isXml` flag) and, on success, builds the
     * in-memory entry list, attaches the search handlers, removes the overlay
     * and opens the popup via proceedsearch(). On failure the overlay is
     * removed and the page is unlocked again.
     *
     * NOTE(review): titles and content snippets are concatenated into HTML and
     * assigned through innerHTML without escaping; only tags in `content` are
     * stripped. Confirm the index is trusted.
     *
     * @param {string} path       URL of the search index.
     * @param {string} search_id  id of the text input holding the query.
     * @param {string} content_id id of the element that receives the results.
     */
    var searchFunc = function(path, search_id, content_id) {
      'use strict';

      // start loading animation
      $("body")
        .append('<div class="search-popup-overlay local-search-pop-overlay">' +
          '<div id="search-loading-icon">' +
          '<i class="fa fa-spinner fa-pulse fa-5x fa-fw"></i>' +
          '</div>' +
          '</div>')
        .css('overflow', 'hidden');
      $("#search-loading-icon").css('margin', '20% auto 0 auto').css('text-align', 'center');

      $.ajax({
        url: path,
        dataType: isXml ? "xml" : "json",
        async: true,
        error: function() {
          // A failed request never reaches `success`, which is the only other
          // place the loading overlay is removed; without this the overlay
          // stays on screen and <body> keeps overflow:hidden.
          $(".local-search-pop-overlay").remove();
          $('body').css('overflow', '');
        },
        success: function(res) {
          // get the contents from search data
          isfetched = true;
          $('.popup').detach().appendTo('.header-inner');
          // XML: one {title, content, url} record per <entry>; JSON: used as-is.
          var datas = isXml ? $("entry", res).map(function() {
            return {
              title: $("title", this).text(),
              content: $("content",this).text(),
              url: $("url" , this).text()
            };
          }).get() : res;
          var input = document.getElementById(search_id);
          var resultContent = document.getElementById(content_id);

          // Runs one search over `datas` for the current input value and
          // renders the outcome into `resultContent`.
          var inputEventFunction = function() {
            var searchText = input.value.trim().toLowerCase();
            // Split on whitespace / hyphens; for multi-word queries the full
            // phrase is searched as an extra keyword.
            var keywords = searchText.split(/[\s\-]+/);
            if (keywords.length > 1) {
              keywords.push(searchText);
            }
            var resultItems = [];
            if (searchText.length > 0) {
              // perform local searching
              datas.forEach(function(data) {
                var isMatch = false;
                var hitCount = 0;
                var searchTextCount = 0;
                var title = data.title.trim();
                var titleInLowerCase = title.toLowerCase();
                // strip anything that looks like a tag from the content
                var content = data.content.trim().replace(/<[^>]+>/g,"");

                var contentInLowerCase = content.toLowerCase();
                var articleUrl = decodeURIComponent(data.url).replace(/\/{2,}/g, '/');
                var indexOfTitle = [];
                var indexOfContent = [];
                // only match articles with not empty titles
                if(title != '') {
                  keywords.forEach(function(keyword) {
                    // Collect every non-overlapping occurrence of `word` in
                    // `text` as {position, word}; empty words match nothing.
                    function getIndexByWord(word, text, caseSensitive) {
                      var wordLen = word.length;
                      if (wordLen === 0) {
                        return [];
                      }
                      var startPosition = 0, position = [], index = [];
                      if (!caseSensitive) {
                        text = text.toLowerCase();
                        word = word.toLowerCase();
                      }
                      while ((position = text.indexOf(word, startPosition)) > -1) {
                        index.push({position: position, word: word});
                        startPosition = position + wordLen;
                      }
                      return index;
                    }

                    indexOfTitle = indexOfTitle.concat(getIndexByWord(keyword, titleInLowerCase, false));
                    indexOfContent = indexOfContent.concat(getIndexByWord(keyword, contentInLowerCase, false));
                  });
                  if (indexOfTitle.length > 0 || indexOfContent.length > 0) {
                    isMatch = true;
                    hitCount = indexOfTitle.length + indexOfContent.length;
                  }
                }

                // show search results

                if (isMatch) {
                  // sort index by position of keyword
                  // (descending position, shorter word first on ties, so the
                  // earliest and longest hit sits at the END of the array,
                  // which is where mergeIntoSlice pops from)

                  [indexOfTitle, indexOfContent].forEach(function (index) {
                    index.sort(function (itemLeft, itemRight) {
                      if (itemRight.position !== itemLeft.position) {
                        return itemRight.position - itemLeft.position;
                      } else {
                        return itemLeft.word.length - itemRight.word.length;
                      }
                    });
                  });

                  // merge hits into slices
                  // Pops hits off the end of `index` while they end at or
                  // before `end`, discarding hits that overlap an accepted
                  // one. Also adds the slice's full-phrase count to the
                  // enclosing `searchTextCount`.

                  function mergeIntoSlice(text, start, end, index) {
                    var item = index[index.length - 1];
                    var position = item.position;
                    var word = item.word;
                    var hits = [];
                    var searchTextCountInSlice = 0;
                    while (position + word.length <= end && index.length != 0) {
                      if (word === searchText) {
                        searchTextCountInSlice++;
                      }
                      hits.push({position: position, length: word.length});
                      var wordEnd = position + word.length;

                      // move to next position of hit

                      index.pop();
                      while (index.length != 0) {
                        item = index[index.length - 1];
                        position = item.position;
                        word = item.word;
                        if (wordEnd > position) {
                          index.pop();
                        } else {
                          break;
                        }
                      }
                    }
                    searchTextCount += searchTextCountInSlice;
                    return {
                      hits: hits,
                      start: start,
                      end: end,
                      searchTextCount: searchTextCountInSlice
                    };
                  }

                  var slicesOfTitle = [];
                  if (indexOfTitle.length != 0) {
                    slicesOfTitle.push(mergeIntoSlice(title, 0, title.length, indexOfTitle));
                  }

                  var slicesOfContent = [];
                  while (indexOfContent.length != 0) {
                    var item = indexOfContent[indexOfContent.length - 1];
                    var position = item.position;
                    var word = item.word;
                    // cut out 100 characters
                    // (20 before the hit, 80 from its start, clamped to the
                    // content and widened to hold the whole word)
                    var start = position - 20;
                    var end = position + 80;
                    if(start < 0){
                      start = 0;
                    }
                    if (end < position + word.length) {
                      end = position + word.length;
                    }
                    if(end > content.length){
                      end = content.length;
                    }
                    slicesOfContent.push(mergeIntoSlice(content, start, end, indexOfContent));
                  }

                  // sort slices in content by search text's count and hits' count

                  slicesOfContent.sort(function (sliceLeft, sliceRight) {
                    if (sliceLeft.searchTextCount !== sliceRight.searchTextCount) {
                      return sliceRight.searchTextCount - sliceLeft.searchTextCount;
                    } else if (sliceLeft.hits.length !== sliceRight.hits.length) {
                      return sliceRight.hits.length - sliceLeft.hits.length;
                    } else {
                      return sliceLeft.start - sliceRight.start;
                    }
                  });

                  // select top N slices in content

                  var upperBound = parseInt('5', 10);
                  if (upperBound >= 0) {
                    slicesOfContent = slicesOfContent.slice(0, upperBound);
                  }

                  // highlight title and content
                  // Returns text[slice.start, slice.end) with each hit wrapped
                  // in <b class="search-keyword">.

                  function highlightKeyword(text, slice) {
                    var result = '';
                    var prevEnd = slice.start;
                    slice.hits.forEach(function (hit) {
                      result += text.substring(prevEnd, hit.position);
                      var end = hit.position + hit.length;
                      result += '<b class="search-keyword">' + text.substring(hit.position, end) + '</b>';
                      prevEnd = end;
                    });
                    result += text.substring(prevEnd, slice.end);
                    return result;
                  }

                  var resultItem = '';

                  if (slicesOfTitle.length != 0) {
                    resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + highlightKeyword(title, slicesOfTitle[0]) + "</a>";
                  } else {
                    resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + title + "</a>";
                  }

                  slicesOfContent.forEach(function (slice) {
                    resultItem += "<a href='" + articleUrl + "'>" +
                      "<p class=\"search-result\">" + highlightKeyword(content, slice) +
                      "...</p>" + "</a>";
                  });

                  resultItem += "</li>";
                  resultItems.push({
                    item: resultItem,
                    searchTextCount: searchTextCount,
                    hitCount: hitCount,
                    id: resultItems.length
                  });
                }
              });
            }
            if (keywords.length === 1 && keywords[0] === "") {
              // empty query: show the search placeholder icon
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-search fa-5x"></i></div>';
            } else if (resultItems.length === 0) {
              // nothing matched
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-frown-o fa-5x"></i></div>';
            } else {
              // order: full-phrase matches, then total hits, then later entries first
              resultItems.sort(function (resultLeft, resultRight) {
                if (resultLeft.searchTextCount !== resultRight.searchTextCount) {
                  return resultRight.searchTextCount - resultLeft.searchTextCount;
                } else if (resultLeft.hitCount !== resultRight.hitCount) {
                  return resultRight.hitCount - resultLeft.hitCount;
                } else {
                  return resultRight.id - resultLeft.id;
                }
              });
              var searchResultList = '<ul class=\"search-result-list\">';
              resultItems.forEach(function (result) {
                searchResultList += result.item;
              });
              searchResultList += "</ul>";
              resultContent.innerHTML = searchResultList;
            }
          };

          // This comparison is between two literals and is always false here,
          // so the search runs on search-icon click or Enter (key code 13)
          // rather than on every input event.
          if ('auto' === 'manual') {
            input.addEventListener('input', inputEventFunction);
          } else {
            $('.search-icon').click(inputEventFunction);
            input.addEventListener('keypress', function (event) {
              if (event.keyCode === 13) {
                inputEventFunction();
              }
            });
          }

          // remove loading animation
          $(".local-search-pop-overlay").remove();
          $('body').css('overflow', '');

          proceedsearch();
        }
      });
    };

    // handle and trigger popup window;
    // While the index has not been fetched (isfetched is still false) a
    // trigger click starts the fetch; afterwards it just reopens the popup.
    $('.popup-trigger').on('click', function (evt) {
      evt.stopPropagation();
      if (isfetched !== false) {
        proceedsearch();
        return;
      }
      searchFunc(path, 'local-search-input', 'local-search-result');
    });

    $('.popup-btn-close').on('click', onPopupClose);
    // Stop clicks inside the popup from bubbling further up the DOM.
    $('.popup').on('click', function (evt) {
      evt.stopPropagation();
    });
    // Key code 27 (Escape) closes the popup while it is visible.
    $(document).on('keyup', function (evt) {
      if (evt.which !== 27) {
        return;
      }
      if ($('.search-popup').is(':visible')) {
        onPopupClose();
      }
    });
  </script>





  

  

  

  

  
<script>
// Load mermaid only when the page contains at least one <pre class="mermaid">,
// then initialise it with the settings below.
// The URL is explicit https: so the script is never fetched over plaintext
// when the page is served over http, and still resolves from file://.
if ($('body').find('pre.mermaid').length) {
  $.ajax({
    type: 'GET',
    url: 'https://cdn.bootcss.com/mermaid/8.0.0-rc.8/mermaid.min.js',
    dataType: 'script',
    cache: true,
    success: function() {
      mermaid.initialize({
        theme: 'dark',
        logLevel: 3,
        flowchart: { curve: 'linear' },
        gantt: { axisFormat: '%m/%d/%Y' },
        sequence: { actorMargin: 50 }
      });
    }
  });
}
</script>


  
  <script>
    // Inject Baidu's push script ahead of the first <script> element in the
    // document, picking the script URL by the page's own scheme.
    (function () {
        var servedOverHttps = window.location.protocol.split(':')[0] === 'https';
        var pushScript = document.createElement('script');
        pushScript.src = servedOverHttps
            ? 'https://zz.bdstatic.com/linksubmit/push.js'
            : 'http://push.zhanzhang.baidu.com/push.js';
        var firstScript = document.getElementsByTagName("script")[0];
        firstScript.parentNode.insertBefore(pushScript, firstScript);
    })();
  </script>


  

  

  

  

  

  
<script>
  // Wrap every .highlight block in a .highlight-wrap <div> that also holds a
  // copy button. Clicking the button copies the block's `.code .line` texts
  // (joined with newlines) through a temporary off-screen <textarea>.
  $('.highlight').each(function(i, e) {
    var $wrap = $('<div>').addClass('highlight-wrap');
    $(e).after($wrap);
    $wrap.append($('<button>').addClass('copy-btn').append('复制').on('click', function() {
      var code = $(this).parent().find('.code').find('.line').map(function(idx, line) {
        return $(line).text();
      }).toArray().join('\n');
      var ta = document.createElement('textarea');
      var yPosition = window.pageYOffset || document.documentElement.scrollTop;
      ta.style.top = yPosition + 'px'; // Prevent page scroll
      ta.style.position = 'absolute';
      ta.style.opacity = '0';
      // read-only while selecting, then writable again before the copy
      ta.readOnly = true;
      ta.value = code;
      document.body.appendChild(ta);
      ta.select();
      ta.setSelectionRange(0, code.length);
      ta.readOnly = false;
      var result = document.execCommand('copy');

      if (result) $(this).text('复制成功');
      else $(this).text('复制失败');

      ta.blur(); // For iOS
      $(this).blur();
      // The textarea is only a vehicle for the copy; remove it so repeated
      // clicks do not pile up hidden textareas at the end of <body>.
      document.body.removeChild(ta);
    })).on('mouseleave', function() {
      // Restore the button label shortly after the pointer leaves the wrapper.
      var $b = $(this).find('.copy-btn');
      setTimeout(function() {
        $b.text('复制');
      }, 300);
    }).append(e);
  });
</script>


  

  

</body>
</html>
